{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Tokenizer.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7fjcTlyE3WvR"
      },
      "source": [
        "#Tokenizers\n",
        "Copyright 2020 Denis Rothman, MIT License\n",
        "\n",
        "Reference 1 for word embedding:\n",
        "https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
        "\n",
        "Reference 2 for cosine similarity:\n",
        "SciKit Learn cosine similarity documentation\n",
        "\n",
        "***Upload text.txt before running the Notebook***"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JKJ8Saf6vR9b",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e329b785-128d-447a-b97d-eeaeb740e9e4"
      },
      "source": [
        "#@title Pre-Requisistes\n",
        "!pip install gensim==3.8.3\n",
        "import nltk\n",
        "nltk.download('punkt')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting gensim==3.8.3\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)\n",
            "\u001b[K     |████████████████████████████████| 24.2MB 1.5MB/s \n",
            "\u001b[?25hRequirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.4.1)\n",
            "Requirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.19.5)\n",
            "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (5.0.0)\n",
            "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.15.0)\n",
            "Installing collected packages: gensim\n",
            "  Found existing installation: gensim 4.0.1\n",
            "    Uninstalling gensim-4.0.1:\n",
            "      Successfully uninstalled gensim-4.0.1\n",
            "Successfully installed gensim-3.8.3\n",
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 1
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7o7EeDUUu0Sh"
      },
      "source": [
        "import math\n",
        "import numpy as np\n",
        "from nltk.tokenize import sent_tokenize, word_tokenize \n",
        "import gensim \n",
        "from gensim.models import Word2Vec \n",
        "import numpy as np\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "import matplotlib.pyplot as plt\n",
        "import warnings \n",
        "warnings.filterwarnings(action = 'ignore') "
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1NRomrXEJOxJ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "360af319-8259-469e-babd-6eaa4cd6c714"
      },
      "source": [
        "#@title Word2Vec Tokenization\n",
        "#‘text.txt’ file \n",
        "sample = open(\"text.txt\", \"r\") \n",
        "s = sample.read() \n",
        "\n",
        "# processing escape characters \n",
        "f = s.replace(\"\\n\", \" \") \n",
        "\n",
        "data = [] \n",
        "# sentence parsing\n",
        "for i in sent_tokenize(f): \n",
        "\ttemp = [] \n",
        "\t# tokenize the sentence into words \n",
        "\tfor j in word_tokenize(i): \n",
        "\t\ttemp.append(j.lower())\n",
        "\tdata.append(temp)\n",
        "\n",
        "# Creating Skip Gram model \n",
        "model2 = gensim.models.Word2Vec(data, min_count = 1, size = 512,window = 5, sg = 1) \n",
        "print(model2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Word2Vec(vocab=11822, size=512, alpha=0.025)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YcC_3JLcJTgw"
      },
      "source": [
        "#@title Cosine Similarity\n",
        "def similarity(word1,word2):\n",
        "        cosine=False #default value\n",
        "        try:\n",
        "                a=model2[word1]\n",
        "                cosine=True\n",
        "        except KeyError:     #The KeyError exception is raised\n",
        "                print(word1, \":[unk] key not found in dictionary\")#False implied\n",
        "\n",
        "        try:\n",
        "                b=model2[word2]#a=True implied\n",
        "        except KeyError:       #The KeyError exception is raised\n",
        "                cosine=False   #both a and b must be true\n",
        "                print(word2, \":[unk] key not found in dictionary\")\n",
        "\n",
        "        if(cosine==True):\n",
        "                b=model2[word2]\n",
        "                # compute cosine similarity\n",
        "                dot = np.dot(a, b)\n",
        "                norma = np.linalg.norm(a)\n",
        "                normb = np.linalg.norm(b)\n",
        "                cos = dot / (norma * normb)\n",
        "\n",
        "                aa = a.reshape(1,512) \n",
        "                ba = b.reshape(1,512)\n",
        "                #print(\"Word1\",aa)\n",
        "                #print(\"Word2\",ba)\n",
        "                cos_lib = cosine_similarity(aa, ba)\n",
        "                #print(cos_lib,\"word similarity\")\n",
        "          \n",
        "        if(cosine==False):cos_lib=0;\n",
        "        return cos_lib"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fMfgbogHJVh-",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "3bdcc464-75fb-4f60-be5b-86b59af7809e"
      },
      "source": [
        "#@title Case 0: Words in text and dictionary\n",
        "word1=\"freedom\";word2=\"liberty\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Similarity [[0.38632965]] freedom liberty\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4B7vvKxOLbYC",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "837c3d9f-f64c-43be-f689-0c8ea9284c25"
      },
      "source": [
        "#@title Word(s) Case 1: Word not in text or dictionary\n",
        "word1=\"corporations\";word2=\"rights\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "corporations :[unk] key not found in dictionary\n",
            "Similarity 0 corporations rights\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qkFIC79JCQJp",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "2f53a434-ce87-47c2-c37c-375f9afac846"
      },
      "source": [
        "#@title Case 2: Noisy Relationship \n",
        "word1=\"etext\";word2=\"declaration\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Similarity [[0.51544815]] etext declaration\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mKVPiEi-GZtf",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "f2b3a8b0-63f1-4e0c-a242-ca709140bcb3"
      },
      "source": [
        "#@title Case 3: Rare words\n",
        "word1=\"justiciar\";word2=\"judgement\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Similarity [[0.2304948]] justiciar judgement\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0xZtAm3DHGJg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "7ac7a0f5-3509-4254-a7f2-c55b6fb8b46c"
      },
      "source": [
        "#@title Case 4: Replacing words\n",
        "word1=\"judge\";word2=\"judgement\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)\n",
        "\n",
        "word1=\"justiciar\";word2=\"judge\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Similarity [[0.20353234]] judge judgement\n",
            "Similarity [[0.37659135]] justiciar judge\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wOSID8kXHXWt",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "1a60f440-4d9c-4b68-e30f-60fe474cf458"
      },
      "source": [
        "#@title Case 5: Entailment\n",
        "word1=\"pay\";word2=\"debt\"\n",
        "print(\"Similarity\",similarity(word1,word2),word1,word2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Similarity [[0.54338676]] pay debt\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}